{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import pandas as pd\n",
    "# Path of the JSONL shard that the loading cell opens; despite the name it is a single file, not a directory.\n",
    "DATA_DIR = \"/data/data/fintuning/wanjuan_exam/part-003756-a894b46e.jsonl\"\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "import re\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_example(item, cot=False):\n",
    "    \"\"\"Render one exam record as a (question text, answer text) pair.\n",
    "\n",
    "    Args:\n",
    "        item: mapping providing the keys 'q_main', 'option_a' .. 'option_d',\n",
    "            'std_ans' and 'answer_detail'.\n",
    "        cot: if true, the answer text is the explanation followed by the\n",
    "            answer; otherwise it is the answer followed by the explanation.\n",
    "\n",
    "    Returns:\n",
    "        (example, answer): ``example`` is the question stem plus one\n",
    "        '<letter>. <text>' line for every option that is not an empty\n",
    "        string; ``answer`` is the formatted target text.\n",
    "    \"\"\"\n",
    "    question_lines = [item[\"q_main\"]]\n",
    "    for letter in \"ABCD\":\n",
    "        option_text = item[f\"option_{letter.lower()}\"]\n",
    "        # Options stored as an empty string are left out of the prompt.\n",
    "        if option_text != \"\":\n",
    "            question_lines.append(f'{letter}. {option_text}')\n",
    "    example = \"\\n\".join(question_lines) + \"\\n\"\n",
    "\n",
    "    if cot:\n",
    "        # Explanation first: cut the explanation at the first occurrence of\n",
    "        # the marker matched by the regex (DOTALL, so everything after it,\n",
    "        # including later lines, is removed), then append the answer.\n",
    "        explanation = re.sub(r\"故选.*\", '', f'{item[\"answer_detail\"]}', flags=re.DOTALL)\n",
    "        return example, explanation + f'答案为{item[\"std_ans\"]}'\n",
    "\n",
    "    # Answer first, then the untouched explanation.\n",
    "    return example, f'{item[\"std_ans\"]}。{item[\"answer_detail\"]}'\n",
    "\n",
    "# Build instruction-tuning records (instruction / input / output) from the raw exam file.\n",
    "all_data = []\n",
    "# Counts records that take the plain (non-CoT) fallback below; it was previously\n",
    "# declared but never updated.\n",
    "no_explain_counter = 0\n",
    "cot = True\n",
    "# Explicit UTF-8: the file holds Chinese text and the platform default encoding\n",
    "# is not guaranteed to be UTF-8 (the writer cell already uses UTF-8).\n",
    "with open(DATA_DIR, 'r', encoding=\"utf-8\") as file:\n",
    "    for line in file:\n",
    "        if not line.strip():\n",
    "            # Tolerate blank lines instead of failing in json.loads.\n",
    "            continue\n",
    "        json_obj = json.loads(line)\n",
    "        # Records with an empty standard answer or an explanation shorter than\n",
    "        # 10 characters always use the plain answer-first format.\n",
    "        # (Original note: for the 2M-record dataset this branch used `continue`.)\n",
    "        lacks_explanation = json_obj[\"std_ans\"] == \"\" or len(json_obj[\"answer_detail\"]) < 10\n",
    "        if lacks_explanation:\n",
    "            no_explain_counter += 1\n",
    "        use_cot = cot and not lacks_explanation\n",
    "        choice, answer = format_example(json_obj, use_cot)\n",
    "        if use_cot:\n",
    "            # CoT prompt: names the question type and asks for explanation, then answer.\n",
    "            instruct = f\"以下是一道{json_obj['q_type']}:\\n{choice}请先给出解释再给出答案。\\n\"\n",
    "        else:\n",
    "            # Plain prompt: asks for the answer only.\n",
    "            instruct = f\"以下是一道单选题:\\n{choice}请给出答案。\\n\"\n",
    "        all_data.append({\n",
    "            \"instruction\": instruct,\n",
    "            \"input\": \"\",\n",
    "            \"output\": answer,\n",
    "        })\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3997183\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: number of records built by the loading cell.\n",
    "print(len(all_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filter before: 3997183\n",
      "filter after: 3997183\n"
     ]
    }
   ],
   "source": [
    "# Drop every record whose output contains the character \"略\".\n",
    "# NOTE(review): this is a substring match, so it also removes outputs where the\n",
    "# character is part of a longer word (e.g. \"策略\") - confirm that is intended.\n",
    "print(\"filter before:\", len(all_data))\n",
    "filted_all_data = [x for x in all_data if \"略\" not in x[\"output\"]]\n",
    "# Report the filtered list; this used to print len(all_data) again, so the\n",
    "# before/after numbers were always identical.\n",
    "print(\"filter after:\", len(filted_all_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'instruction': '以下是一道单选题:\\n三位同学在一起讨论有关科技革命的话题，以下是他们谈话内容的片断：他们讨论这一话题的中心是（ ）  \\n甲：科技革命成果的商品化周期只需要3～5年  \\n乙：新技术之间联系密切，相互促进  \\n丙：在经济增长的因素中，科技进步的因素占80%\\nA. 第三次科技革命的影响\\nB. 第三次科技革命的特点\\nC. 第三次科技革命的成就\\nD. 第三次科技革命的内容\\n请给出答案。\\n', 'input': '', 'output': 'B。根据所学知识，第三次科技革命与前两次科技革命相比，有下列特点：首先，这次科技革命不仅涌现了大量的科学成果，而且大大加快了科学技术转化为生产力的速度，缩短了知识变为物质财富的过程。其次，科学技术的各个领域之间相互渗透，一种技术的发展引起好几种技术的革命。第三，新技术成为社会生产力中最活跃的因素，在促进经济增长的各种因素中，科技进步所占的比重不断上升。甲、乙、丙同学的谈话都与第三次科技革命的特点有关。故选B  \\n【点评】本题主要考查第三次科技革命。本题既考查了第三次科技革命的特点，也考查了学生提取有效信息的能力；学生应注意对三次科技革命的特点进行对比掌握。'}\n"
     ]
    }
   ],
   "source": [
    "# Spot-check one filtered record; index 1000 requires at least 1001 records after filtering.\n",
    "print(filted_all_data[1000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200000\n",
      "{'instruction': '以下是一道单选题:\\n【题文】神经系统中，专门调节心跳、血压等重要生命活动的神经传导中枢在：\\nA. 大脑皮层\\nB. 小脑\\nC. 脑干\\nD. 脊髓\\n请给出答案。\\n', 'input': '', 'output': 'C。  \\n试题分析：A、大脑由左右两个半球组成，大脑皮层的表面是灰质，称为大脑皮层，它是调节人体生理活动的最高级中枢，有躯体运动中枢、躯体感觉中枢、视觉中枢、听觉中枢等，B、小脑的主要功能是协调运动，维持身体的平衡，D、脊髓位于脊柱的椎管内，包括灰质和白质，具有反射和传导的功能，故A、B、D都不符合题意；  \\nC、脑干中有能够调节人体基本生命活动的中枢，如呼吸中枢、心血管运动中枢等，一旦损伤会危及生命，所以脑干被称为“生命中枢”，故符合题意。  \\n考点：本题考查的是神经系统的功能，解答此类题目的关键是熟记神经系统的功能。'}\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# Draw a fixed-size random subset of the filtered records.\n",
    "# A private, seeded RNG makes the subset reproducible across kernel restarts\n",
    "# without touching the global random state.\n",
    "SAMPLE_SEED = 42\n",
    "SAMPLE_SIZE = 200000\n",
    "# random.sample raises ValueError when k exceeds the population size, so cap it.\n",
    "few_data = random.Random(SAMPLE_SEED).sample(filted_all_data, min(SAMPLE_SIZE, len(filted_all_data)))\n",
    "print(len(few_data))\n",
    "# Only show the spot-check record when the sample is large enough to have one.\n",
    "if len(few_data) > 100:\n",
    "    print(few_data[100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Write every filtered record to the output file as one JSON object per line.\n",
    "json_dir = \"/data/data/fintuning/llamafactory_input/wanjuan_exam_390w.jsonl\"\n",
    "with open(json_dir, 'w', encoding=\"utf-8\") as f:\n",
    "    for item in filted_all_data:\n",
    "        # ensure_ascii=False writes non-ASCII characters as-is instead of \\uXXXX escapes.\n",
    "        serialized = json.dumps(item, ensure_ascii=False)\n",
    "        f.write(serialized + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3837404\n"
     ]
    }
   ],
   "source": [
    "# Show the first filtered record.\n",
    "# NOTE(review): the output stored with this cell is a bare number rather than a record,\n",
    "# so it appears to come from an earlier version of the cell - re-run to refresh it.\n",
    "print(filted_all_data[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
